This example walks through creating a regression model using the Boston data frame from the MASS package.

library(MASS)
names(Boston)
 [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
 [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"   
str(Boston)
'data.frame':   506 obs. of  14 variables:
 $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
 $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
 $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
 $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
 $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
 $ rm     : num  6.58 6.42 7.18 7 7.15 ...
 $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
 $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
 $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
 $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
 $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
 $ black  : num  397 397 393 395 397 ...
 $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
 $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

We would like to create a model that predicts the median value of owner occupied homes (medv). Start by viewing the relationship between medv and the other variables in the Boston data frame. We will use the package car. If you do not have car installed on your machine, install it now.

library(car)  
scatterplotMatrix(~ medv + lstat + black + ptratio + tax + rad + dis + age + rm + nox + chas + indus + zn + crim, data = Boston)

Comment about scatter plot

Simple Linear Model

simple.lm <- lm(medv ~ lstat, data = Boston)
simple.lm

Call:
lm(formula = medv ~ lstat, data = Boston)

Coefficients:
(Intercept)        lstat  
      34.55        -0.95  
summary(simple.lm)

Call:
lm(formula = medv ~ lstat, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-15.168  -3.990  -1.318   2.034  24.500 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 34.55384    0.56263   61.41   <2e-16 ***
lstat       -0.95005    0.03873  -24.53   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 6.216 on 504 degrees of freedom
Multiple R-squared:  0.5441,    Adjusted R-squared:  0.5432 
F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
anova(simple.lm)
Analysis of Variance Table

Response: medv
           Df Sum Sq Mean Sq F value    Pr(>F)    
lstat       1  23244 23243.9  601.62 < 2.2e-16 ***
Residuals 504  19472    38.6                      
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Basic Regression Plots

par(mfrow=c(2, 2))
plot(simple.lm)

par(mfrow=c(1, 1))

Ggplot2 Graph

library(ggplot2)
ggplot(data = Boston, aes(x = lstat, y = medv)) +
  geom_point() + 
  theme_bw() + 
  stat_smooth(method = "lm")

Two Independent Variables

twoiv.lm <- lm(medv ~ lstat + rm, data = Boston)
summary(twoiv.lm)

Call:
lm(formula = medv ~ lstat + rm, data = Boston)

Residuals:
    Min      1Q  Median      3Q     Max 
-18.076  -3.516  -1.010   1.909  28.131 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) -1.35827    3.17283  -0.428    0.669    
lstat       -0.64236    0.04373 -14.689   <2e-16 ***
rm           5.09479    0.44447  11.463   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.54 on 503 degrees of freedom
Multiple R-squared:  0.6386,    Adjusted R-squared:  0.6371 
F-statistic: 444.3 on 2 and 503 DF,  p-value: < 2.2e-16

Plane of Best Fit

More Graphs Again

par(mfrow=c(2, 2))
plot(twoiv.lm)

par(mfrow=c(1, 1))

Forward and Backward Selection

The functions add1 and update can be used to create a model with forward selection.

SCOPE <- (~ . + crim + zn + indus + chas + nox + rm + age + dis + rad + tax + ptratio + black + lstat)
mod.fs <- lm(medv ~ 1, data = Boston)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ 1
        Df Sum of Sq   RSS    AIC F value    Pr(>F)    
<none>               42716 2246.5                      
crim     1    6440.8 36276 2165.8  89.486 < 2.2e-16 ***
zn       1    5549.7 37167 2178.1  75.258 < 2.2e-16 ***
indus    1    9995.2 32721 2113.6 153.955 < 2.2e-16 ***
chas     1    1312.1 41404 2232.7  15.972 7.391e-05 ***
nox      1    7800.1 34916 2146.5 112.591 < 2.2e-16 ***
rm       1   20654.4 22062 1914.2 471.847 < 2.2e-16 ***
age      1    6069.8 36647 2171.0  83.478 < 2.2e-16 ***
dis      1    2668.2 40048 2215.9  33.580 1.207e-08 ***
rad      1    6221.1 36495 2168.9  85.914 < 2.2e-16 ***
tax      1    9377.3 33339 2123.1 141.761 < 2.2e-16 ***
ptratio  1   11014.3 31702 2097.6 175.106 < 2.2e-16 ***
black    1    4749.9 37966 2188.9  63.054 1.318e-14 ***
lstat    1   23243.9 19472 1851.0 601.618 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + lstat)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat
        Df Sum of Sq   RSS    AIC  F value    Pr(>F)    
<none>               19472 1851.0                       
crim     1     146.9 19325 1849.2   3.8246  0.051059 .  
zn       1     160.3 19312 1848.8   4.1758  0.041527 *  
indus    1      98.7 19374 1850.4   2.5635  0.109981    
chas     1     786.3 18686 1832.2  21.1665 5.336e-06 ***
nox      1       4.8 19468 1852.9   0.1239  0.724966    
rm       1    4033.1 15439 1735.6 131.3942 < 2.2e-16 ***
age      1     304.3 19168 1845.0   7.9840  0.004907 ** 
dis      1     772.4 18700 1832.5  20.7764 6.488e-06 ***
rad      1      25.1 19447 1852.4   0.6491  0.420799    
tax      1     274.4 19198 1845.8   7.1896  0.007574 ** 
ptratio  1    2670.1 16802 1778.4  79.9340 < 2.2e-16 ***
black    1     198.3 19274 1847.8   5.1764  0.023316 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + rm)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm
        Df Sum of Sq   RSS    AIC F value    Pr(>F)    
<none>               15439 1735.6                      
crim     1    311.42 15128 1727.3 10.3341 0.0013900 ** 
zn       1     56.56 15383 1735.7  1.8457 0.1748999    
indus    1     61.09 15378 1735.6  1.9942 0.1585263    
chas     1    548.53 14891 1719.3 18.4921 2.051e-05 ***
nox      1     14.90 15424 1737.1  0.4849 0.4865454    
age      1     20.18 15419 1736.9  0.6571 0.4179577    
dis      1    351.15 15088 1725.9 11.6832 0.0006819 ***
rad      1    180.45 15259 1731.6  5.9367 0.0151752 *  
tax      1    425.16 15014 1723.5 14.2154 0.0001824 ***
ptratio  1   1711.32 13728 1678.1 62.5791 1.645e-14 ***
black    1    512.31 14927 1720.5 17.2290 3.892e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + ptratio)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm + ptratio
       Df Sum of Sq   RSS    AIC F value    Pr(>F)    
<none>              13728 1678.1                      
crim    1    122.52 13606 1675.6  4.5115 0.0341560 *  
zn      1     14.96 13713 1679.6  0.5467 0.4600162    
indus   1      0.83 13727 1680.1  0.0301 0.8622688    
chas    1    377.96 13350 1666.0 14.1841 0.0001854 ***
nox     1     24.81 13703 1679.2  0.9072 0.3413103    
age     1     66.24 13662 1677.7  2.4291 0.1197340    
dis     1    499.08 13229 1661.4 18.9009 1.668e-05 ***
rad     1      6.07 13722 1679.9  0.2218 0.6378931    
tax     1     44.36 13684 1678.5  1.6242 0.2031029    
black   1    389.68 13338 1665.6 14.6369 0.0001468 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + dis)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm + ptratio + dis
       Df Sum of Sq   RSS    AIC F value    Pr(>F)    
<none>              13229 1661.4                      
crim    1    233.54 12995 1654.4  8.9856  0.002857 ** 
zn      1    144.81 13084 1657.8  5.5337  0.019040 *  
indus   1    242.65 12986 1654.0  9.3424  0.002359 ** 
chas    1    267.43 12962 1653.1 10.3162  0.001404 ** 
nox     1    759.56 12469 1633.5 30.4572 5.488e-08 ***
age     1     61.36 13168 1661.0  2.3300  0.127535    
rad     1     22.40 13206 1662.5  0.8481  0.357540    
tax     1    240.34 12989 1654.1  9.2519  0.002476 ** 
black   1    502.64 12726 1643.8 19.7482 1.089e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + nox)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm + ptratio + dis + nox
       Df Sum of Sq   RSS    AIC F value    Pr(>F)    
<none>              12469 1633.5                      
crim    1    141.43 12328 1629.7  5.7248 0.0170955 *  
zn      1    151.71 12318 1629.3  6.1461 0.0134998 *  
indus   1     17.10 12452 1634.8  0.6854 0.4081212    
chas    1    328.27 12141 1622.0 13.4920 0.0002655 ***
age     1      0.25 12469 1635.5  0.0098 0.9211095    
rad     1     53.48 12416 1633.3  2.1494 0.1432543    
tax     1     10.50 12459 1635.0  0.4205 0.5169877    
black   1    311.83 12158 1622.7 12.7991 0.0003806 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + chas)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm + ptratio + dis + nox + chas
       Df Sum of Sq   RSS    AIC F value    Pr(>F)    
<none>              12141 1622.0                      
crim    1   116.330 12025 1619.1  4.8178 0.0286287 *  
zn      1   164.406 11977 1617.1  6.8361 0.0092038 ** 
indus   1    26.274 12115 1622.9  1.0800 0.2991938    
age     1     2.331 12139 1623.9  0.0956 0.7572505    
rad     1    58.556 12082 1621.5  2.4135 0.1209286    
tax     1     4.187 12137 1623.8  0.1718 0.6786847    
black   1   272.837 11868 1612.5 11.4484 0.0007719 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + black)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black
       Df Sum of Sq   RSS    AIC F value   Pr(>F)   
<none>              11868 1612.5                    
crim    1    55.633 11813 1612.1  2.3407 0.126671   
zn      1   189.936 11678 1606.3  8.0832 0.004652 **
indus   1    15.584 11853 1613.8  0.6535 0.419264   
age     1     9.446 11859 1614.1  0.3959 0.529516   
rad     1   144.320 11724 1608.3  6.1180 0.013714 * 
tax     1     2.703 11866 1614.4  0.1132 0.736643   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + zn)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn
       Df Sum of Sq   RSS    AIC F value  Pr(>F)  
<none>              11678 1606.3                  
crim    1    94.712 11584 1604.2  4.0555 0.04457 *
indus   1    16.048 11662 1607.6  0.6825 0.40912  
age     1     1.491 11677 1608.2  0.0633 0.80138  
rad     1    93.614 11585 1604.2  4.0081 0.04583 *
tax     1     3.952 11674 1608.1  0.1679 0.68215  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + crim)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
    crim
       Df Sum of Sq   RSS    AIC F value   Pr(>F)   
<none>              11584 1604.2                    
indus   1    15.773 11568 1605.5  0.6750 0.411725   
age     1     2.470 11581 1606.1  0.1056 0.745387   
rad     1   228.604 11355 1596.1  9.9656 0.001692 **
tax     1     1.305 11582 1606.1  0.0558 0.813396   
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + rad)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
    crim + rad
       Df Sum of Sq   RSS    AIC F value    Pr(>F)    
<none>              11355 1596.1                      
indus   1    33.894 11321 1596.6  1.4790 0.2245162    
age     1     0.096 11355 1598.1  0.0042 0.9485270    
tax     1   273.619 11081 1585.8 12.1978 0.0005214 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.fs <- update(mod.fs, . ~ . + tax)
add1(mod.fs, scope = SCOPE, test = "F")
Single term additions

Model:
medv ~ lstat + rm + ptratio + dis + nox + chas + black + zn + 
    crim + rad + tax
       Df Sum of Sq   RSS    AIC F value Pr(>F)
<none>              11081 1585.8               
indus   1   2.51754 11079 1587.7  0.1120 0.7380
age     1   0.06271 11081 1587.8  0.0028 0.9579
summary(mod.fs)

Call:
lm(formula = medv ~ lstat + rm + ptratio + dis + nox + chas + 
    black + zn + crim + rad + tax, data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.5984  -2.7386  -0.5046   1.7273  26.2373 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  36.341145   5.067492   7.171 2.73e-12 ***
lstat        -0.522553   0.047424 -11.019  < 2e-16 ***
rm            3.801579   0.406316   9.356  < 2e-16 ***
ptratio      -0.946525   0.129066  -7.334 9.24e-13 ***
dis          -1.492711   0.185731  -8.037 6.84e-15 ***
nox         -17.376023   3.535243  -4.915 1.21e-06 ***
chas          2.718716   0.854240   3.183 0.001551 ** 
black         0.009291   0.002674   3.475 0.000557 ***
zn            0.045845   0.013523   3.390 0.000754 ***
crim         -0.108413   0.032779  -3.307 0.001010 ** 
rad           0.299608   0.063402   4.726 3.00e-06 ***
tax          -0.011778   0.003372  -3.493 0.000521 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.736 on 494 degrees of freedom
Multiple R-squared:  0.7406,    Adjusted R-squared:  0.7348 
F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16

Backward Elimination

Backward elimination starts with all of the variables in the model and removes the least significant variable one at a time.

mod.be <- lm(medv ~ . , data = Boston)
drop1(mod.be, test = "F")  # single term deletions
Single term deletions

Model:
medv ~ crim + zn + indus + chas + nox + rm + age + dis + rad + 
    tax + ptratio + black + lstat
        Df Sum of Sq   RSS    AIC  F value    Pr(>F)    
<none>               11079 1589.6                       
crim     1    243.22 11322 1598.6  10.8012 0.0010868 ** 
zn       1    257.49 11336 1599.3  11.4351 0.0007781 ***
indus    1      2.52 11081 1587.8   0.1118 0.7382881    
chas     1    218.97 11298 1597.5   9.7243 0.0019250 ** 
nox      1    487.16 11566 1609.4  21.6342 4.246e-06 ***
rm       1   1871.32 12950 1666.6  83.1040 < 2.2e-16 ***
age      1      0.06 11079 1587.7   0.0027 0.9582293    
dis      1   1232.41 12311 1641.0  54.7305 6.013e-13 ***
rad      1    479.15 11558 1609.1  21.2788 5.071e-06 ***
tax      1    242.26 11321 1598.6  10.7585 0.0011116 ** 
ptratio  1   1194.23 12273 1639.4  53.0350 1.309e-12 ***
black    1    270.63 11349 1599.8  12.0187 0.0005729 ***
lstat    1   2410.84 13490 1687.3 107.0634 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.be <- update(mod.be, . ~ . - age)
drop1(mod.be, test = "F")  # single term deletions
Single term deletions

Model:
medv ~ crim + zn + indus + chas + nox + rm + dis + rad + tax + 
    ptratio + black + lstat
        Df Sum of Sq   RSS    AIC  F value    Pr(>F)    
<none>               11079 1587.7                       
crim     1    243.20 11322 1596.6  10.8221 0.0010747 ** 
zn       1    260.32 11339 1597.4  11.5841 0.0007194 ***
indus    1      2.52 11081 1585.8   0.1120 0.7379887    
chas     1    219.91 11299 1595.6   9.7859 0.0018626 ** 
nox      1    520.87 11600 1608.9  23.1781 1.967e-06 ***
rm       1   1959.55 13038 1668.0  87.1987 < 2.2e-16 ***
dis      1   1352.26 12431 1643.9  60.1743 5.028e-14 ***
rad      1    481.09 11560 1607.2  21.4083 4.751e-06 ***
tax      1    242.24 11321 1596.6  10.7796 0.0010991 ** 
ptratio  1   1200.23 12279 1637.7  53.4092 1.099e-12 ***
black    1    272.26 11351 1597.9  12.1154 0.0005445 ***
lstat    1   2718.88 13798 1696.7 120.9879 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.be <- update(mod.be, . ~ . - indus)
drop1(mod.be, test = "F")  # single term deletions
Single term deletions

Model:
medv ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + 
    black + lstat
        Df Sum of Sq   RSS    AIC F value    Pr(>F)    
<none>               11081 1585.8                      
crim     1    245.37 11327 1594.8  10.939 0.0010104 ** 
zn       1    257.82 11339 1595.4  11.494 0.0007543 ***
chas     1    227.21 11309 1594.0  10.129 0.0015515 ** 
nox      1    541.91 11623 1607.9  24.158 1.209e-06 ***
rm       1   1963.66 13045 1666.3  87.539 < 2.2e-16 ***
dis      1   1448.94 12530 1645.9  64.593 6.837e-15 ***
rad      1    500.92 11582 1606.1  22.331 2.997e-06 ***
tax      1    273.62 11355 1596.1  12.198 0.0005214 ***
ptratio  1   1206.45 12288 1636.0  53.783 9.235e-13 ***
black    1    270.82 11352 1596.0  12.073 0.0005566 ***
lstat    1   2723.48 13805 1695.0 121.411 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
mod.be <- update(mod.be, . ~ . - age)
summary(mod.be)

Call:
lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
    tax + ptratio + black + lstat, data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-15.5984  -2.7386  -0.5046   1.7273  26.2373 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  36.341145   5.067492   7.171 2.73e-12 ***
crim         -0.108413   0.032779  -3.307 0.001010 ** 
zn            0.045845   0.013523   3.390 0.000754 ***
chas          2.718716   0.854240   3.183 0.001551 ** 
nox         -17.376023   3.535243  -4.915 1.21e-06 ***
rm            3.801579   0.406316   9.356  < 2e-16 ***
dis          -1.492711   0.185731  -8.037 6.84e-15 ***
rad           0.299608   0.063402   4.726 3.00e-06 ***
tax          -0.011778   0.003372  -3.493 0.000521 ***
ptratio      -0.946525   0.129066  -7.334 9.24e-13 ***
black         0.009291   0.002674   3.475 0.000557 ***
lstat        -0.522553   0.047424 -11.019  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.736 on 494 degrees of freedom
Multiple R-squared:  0.7406,    Adjusted R-squared:  0.7348 
F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16
## Criterion Based Selection
library(leaps)
models <- regsubsets(medv ~ ., data = Boston, nvmax = 11)
summary(models)
Subset selection object
Call: regsubsets.formula(medv ~ ., data = Boston, nvmax = 11)
13 Variables  (and intercept)
        Forced in Forced out
crim        FALSE      FALSE
zn          FALSE      FALSE
indus       FALSE      FALSE
chas        FALSE      FALSE
nox         FALSE      FALSE
rm          FALSE      FALSE
age         FALSE      FALSE
dis         FALSE      FALSE
rad         FALSE      FALSE
tax         FALSE      FALSE
ptratio     FALSE      FALSE
black       FALSE      FALSE
lstat       FALSE      FALSE
1 subsets of each size up to 11
Selection Algorithm: exhaustive
          crim zn  indus chas nox rm  age dis rad tax ptratio black lstat
1  ( 1 )  " "  " " " "   " "  " " " " " " " " " " " " " "     " "   "*"  
2  ( 1 )  " "  " " " "   " "  " " "*" " " " " " " " " " "     " "   "*"  
3  ( 1 )  " "  " " " "   " "  " " "*" " " " " " " " " "*"     " "   "*"  
4  ( 1 )  " "  " " " "   " "  " " "*" " " "*" " " " " "*"     " "   "*"  
5  ( 1 )  " "  " " " "   " "  "*" "*" " " "*" " " " " "*"     " "   "*"  
6  ( 1 )  " "  " " " "   "*"  "*" "*" " " "*" " " " " "*"     " "   "*"  
7  ( 1 )  " "  " " " "   "*"  "*" "*" " " "*" " " " " "*"     "*"   "*"  
8  ( 1 )  " "  "*" " "   "*"  "*" "*" " " "*" " " " " "*"     "*"   "*"  
9  ( 1 )  "*"  " " " "   "*"  "*" "*" " " "*" "*" " " "*"     "*"   "*"  
10  ( 1 ) "*"  "*" " "   " "  "*" "*" " " "*" "*" "*" "*"     "*"   "*"  
11  ( 1 ) "*"  "*" " "   "*"  "*" "*" " " "*" "*" "*" "*"     "*"   "*"  
R2adj <- summary(models)$adjr2
R2adj
 [1] 0.5432418 0.6371245 0.6767036 0.6878351 0.7051702 0.7123567 0.7182560
 [8] 0.7222072 0.7252743 0.7299149 0.7348058
which.max(R2adj)
[1] 11
MCP <- summary(models)$cp  # Mallow's C_p
MCP
 [1] 362.75295 185.64743 111.64889  91.48526  59.75364  47.17537  37.05889
 [8]  30.62398  25.86592  18.20493  10.11455
BIC <- summary(models)$bic
BIC
 [1] -385.0521 -496.2582 -549.4767 -561.9884 -585.6823 -592.9553 -598.2295
 [8] -600.1663 -600.5767 -603.9917 -608.0353
which.min(BIC)
[1] 11
which.min(MCP)
[1] 11

Comment on model selected with \(R^2_{adj}\), Mallow’s \(C_p\), and the Bayesian Information Criterion (BIC).

plot(models, scale = "adjr2")
plot(models, scale = "Cp")
plot(models, scale = "bic")

Diagnostic Plots Again

par(mfrow=c(2, 2))
plot(mod.be)

par(mfrow=c(1, 1))

More Diagnostic Stuff

library(car)
influenceIndexPlot(mod.be, id.n = 3)

outlierTest(mod.be)
    rstudent unadjusted p-value Bonferonni p
369 5.893600         7.0113e-09   3.5477e-06
372 5.500418         6.0950e-08   3.0841e-05
373 5.325354         1.5341e-07   7.7626e-05
influencePlot(mod.be)

       StudRes        Hat     CookD
369  5.8935999 0.05615812 0.4015156
381 -0.9956297 0.30484091 0.1903293
InflMea <- influence.measures(mod.be)
summary(InflMea)  # Which observations are influential
Potentially influential observations of
     lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +      tax + ptratio + black + lstat, data = Boston) :

    dfb.1_ dfb.crim dfb.zn dfb.chas dfb.nox dfb.rm  dfb.dis dfb.rad
142 -0.10  -0.07     0.12   0.00    -0.06    0.06   -0.12   -0.16  
143  0.01   0.00     0.00  -0.04    -0.03    0.00   -0.01    0.01  
146 -0.02  -0.01     0.00  -0.01     0.05    0.02    0.02   -0.02  
147  0.00   0.00     0.00   0.00    -0.01    0.00    0.00    0.00  
152  0.00   0.00     0.00  -0.01     0.05   -0.02    0.01   -0.01  
153 -0.04  -0.02    -0.02  -0.16    -0.16    0.13   -0.04    0.04  
156  0.01  -0.01     0.00  -0.16    -0.14    0.02   -0.06    0.08  
157 -0.01   0.00     0.00   0.00    -0.02    0.01    0.00    0.01  
160  0.02  -0.01     0.00   0.02    -0.09    0.00   -0.03    0.02  
162  0.04   0.05    -0.13  -0.09     0.01    0.10   -0.06   -0.16  
163 -0.03   0.07    -0.11   0.31    -0.02    0.13    0.00   -0.16  
167 -0.06   0.05    -0.15  -0.09     0.01    0.24   -0.02   -0.18  
187 -0.11   0.03    -0.08  -0.08     0.00    0.25   -0.07    0.06  
215  0.08  -0.10    -0.02  -0.01    -0.19    0.04   -0.06    0.06  
229 -0.03  -0.01    -0.10  -0.06    -0.03    0.16   -0.02    0.06  
234 -0.13   0.00    -0.13  -0.07    -0.01    0.28    0.03    0.04  
254 -0.43   0.06    -0.25  -0.02     0.17    0.44    0.46   -0.07  
358  0.02   0.01     0.00  -0.04    -0.02    0.00   -0.01   -0.01  
359 -0.01   0.00     0.00   0.02     0.01    0.00    0.01    0.00  
365  0.58   0.08     0.07  -0.53    -0.21   -0.59   -0.17    0.04  
366  0.51  -0.12     0.16  -0.05     0.11   -0.84   -0.18    0.25  
368  0.56  -0.01     0.11   0.00    -0.14   -0.64   -0.23    0.15  
369  0.76  -0.19     0.19  -0.11    -0.11   -1.06_* -0.49    0.39  
370  0.06  -0.05     0.04   0.62    -0.09   -0.12   -0.16    0.09  
371 -0.02  -0.02     0.03   0.54    -0.07   -0.02   -0.14    0.06  
372  0.18  -0.06     0.09  -0.10    -0.14   -0.20   -0.32    0.21  
373  0.23  -0.03     0.14   0.88    -0.08   -0.40   -0.24    0.15  
375  0.16   0.02     0.11   0.00    -0.18   -0.18   -0.13    0.16  
381  0.12  -0.64     0.05  -0.01    -0.05   -0.10   -0.06    0.16  
399  0.01  -0.06     0.00   0.00     0.00   -0.01    0.00    0.00  
402  0.10  -0.03    -0.02   0.04    -0.02   -0.09    0.01   -0.06  
405  0.00   0.05     0.00   0.00     0.00    0.00    0.00    0.00  
406  0.03  -0.30     0.01  -0.01    -0.02    0.00   -0.02    0.06  
411  0.00  -0.01     0.00   0.00     0.00    0.00    0.00    0.00  
413  0.43  -0.03     0.08   0.04    -0.41   -0.21   -0.21    0.12  
415  0.14   0.47     0.03   0.04    -0.08   -0.11    0.01   -0.05  
419  0.01   0.28    -0.02   0.01     0.01    0.00    0.02   -0.07  
428 -0.02  -0.11     0.01   0.00    -0.01    0.01   -0.01    0.03  
489 -0.02   0.00    -0.01   0.01    -0.04    0.00   -0.02   -0.17  
490  0.01   0.01     0.00   0.00     0.02   -0.01    0.01    0.06  
491 -0.02  -0.03    -0.01   0.02    -0.08    0.02   -0.03   -0.23  
492  0.00   0.00     0.00   0.00     0.00    0.00    0.00    0.00  
493 -0.04   0.00    -0.02   0.01    -0.03    0.02   -0.01   -0.23  
506  0.05  -0.03    -0.08   0.03    -0.10    0.06    0.10    0.08  
    dfb.tax dfb.ptrt dfb.blck dfb.lstt dffit   cov.r   cook.d hat    
142  0.07    0.12     0.09     0.32     0.48_*  0.95    0.02   0.04  
143  0.00    0.01    -0.01    -0.02    -0.07    1.09_*  0.00   0.07  
146  0.00   -0.02    -0.03     0.03     0.09    1.08_*  0.00   0.05  
147  0.00    0.00     0.00     0.00    -0.01    1.08_*  0.00   0.05  
152  0.00   -0.01     0.00    -0.02     0.06    1.07_*  0.00   0.05  
153  0.00    0.04     0.02     0.11    -0.30    1.08_*  0.01   0.08_*
156 -0.01    0.02     0.15     0.05    -0.30    1.09_*  0.01   0.08_*
157  0.00    0.00     0.02     0.01    -0.03    1.10_*  0.00   0.07  
160  0.00    0.01    -0.01     0.04    -0.11    1.07_*  0.00   0.05  
162  0.19   -0.19     0.01    -0.17     0.46    0.86_*  0.02   0.02  
163  0.18   -0.10     0.01    -0.07     0.47_*  0.97    0.02   0.05  
167  0.20   -0.17     0.02    -0.06     0.47_*  0.87_*  0.02   0.03  
187 -0.15    0.06     0.01     0.02     0.42    0.84_*  0.01   0.02  
215 -0.10   -0.07     0.00     0.40     0.49_*  0.89_*  0.02   0.03  
229 -0.05   -0.03     0.00    -0.02     0.29    0.91_*  0.01   0.01  
234 -0.04   -0.01     0.02     0.04     0.37    0.91_*  0.01   0.02  
254  0.08    0.11     0.05     0.11     0.63_*  0.89_*  0.03   0.05  
358  0.00   -0.01    -0.01     0.01    -0.05    1.07_*  0.00   0.05  
359  0.00    0.00     0.00    -0.01     0.02    1.08_*  0.00   0.05  
365 -0.14   -0.26    -0.09    -0.01    -0.98_*  0.83_*  0.08   0.07_*
366 -0.10   -0.04    -0.01    -0.68     0.95_*  0.92_*  0.07   0.09_*
368 -0.05   -0.08    -0.32    -0.44     0.75_*  0.92_*  0.05   0.07_*
369  0.00   -0.07     0.06    -1.10_*   1.44_*  0.48_*  0.16   0.06  
370  0.13    0.10     0.06    -0.37     0.87_*  0.76_*  0.06   0.05  
371  0.13    0.11     0.09    -0.30     0.78_*  0.82_*  0.05   0.05  
372  0.06    0.01     0.15    -0.38     0.73_*  0.51_*  0.04   0.02  
373  0.08    0.11     0.02    -0.49     1.17_*  0.55_*  0.11   0.05  
375 -0.07   -0.12     0.23     0.32     0.62_*  0.89_*  0.03   0.05  
381 -0.03   -0.03    -0.15     0.06    -0.66_*  1.44_*  0.04   0.30_*
399  0.00    0.00    -0.03    -0.02    -0.08    1.07_*  0.00   0.05  
402  0.00   -0.02    -0.16    -0.08    -0.27    0.92_*  0.01   0.01  
405  0.00    0.00     0.01     0.01     0.05    1.08_*  0.00   0.05  
406  0.00   -0.01    -0.08     0.02    -0.32    1.20_*  0.01   0.16_*
411  0.00    0.00     0.01     0.01    -0.01    1.16_*  0.00   0.12_*
413 -0.05   -0.15    -0.49     0.28     0.84_*  0.80_*  0.06   0.05  
415 -0.06   -0.05    -0.18     0.15     0.69_*  0.95    0.04   0.07_*
419  0.00    0.01    -0.07    -0.05     0.30    1.25_*  0.01   0.19_*
428  0.00   -0.01     0.10     0.06    -0.17    1.08_*  0.00   0.06  
489  0.19    0.01     0.02     0.00     0.21    1.09_*  0.00   0.07_*
490 -0.06    0.00     0.00    -0.01    -0.07    1.11_*  0.00   0.07_*
491  0.24    0.00    -0.01     0.09     0.29    1.09_*  0.01   0.08_*
492  0.00    0.00     0.00     0.00    -0.01    1.11_*  0.00   0.07_*
493  0.24    0.02     0.02    -0.03     0.26    1.08_*  0.01   0.07_*
506  0.05   -0.22     0.01     0.14    -0.32    0.93_*  0.01   0.02  

Transformations

boxCox(mod.be)

boxCox(mod.be, lambda = seq(-0.1, 0.3, 0.01))

mod.log <- lm(log(medv) ~ crim + zn + chas + nox + rm + dis + rad + tax + ptratio + black + lstat, data = Boston)
boxCox(mod.log)

par(mfrow=c(2, 2))
plot(mod.log)

par(mfrow=c(1, 1))
plot(mod.log, which = 1)

library(car)
residualPlots(mod.log)

           Test stat Pr(>|t|)
crim           2.219    0.027
zn             1.530    0.127
chas           0.973    0.331
nox           -0.632    0.527
rm             8.289    0.000
dis            4.071    0.000
rad           -0.777    0.437
tax           -0.125    0.901
ptratio        1.855    0.064
black         -2.732    0.007
lstat          5.857    0.000
Tukey test     5.327    0.000
mod2 <- lm(medv ~ crim + zn + chas + nox + rm + I(rm^2) + dis + rad + tax + ptratio + black + lstat + I(lstat^2), data = Boston)
summary(mod.log)

Call:
lm(formula = log(medv) ~ crim + zn + chas + nox + rm + dis + 
    rad + tax + ptratio + black + lstat, data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.73400 -0.09460 -0.01771  0.09782  0.86290 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  4.0836823  0.2030491  20.112  < 2e-16 ***
crim        -0.0103187  0.0013134  -7.856 2.49e-14 ***
zn           0.0010874  0.0005418   2.007 0.045308 *  
chas         0.1051484  0.0342285   3.072 0.002244 ** 
nox         -0.7217440  0.1416535  -5.095 4.97e-07 ***
rm           0.0906728  0.0162807   5.569 4.20e-08 ***
dis         -0.0517059  0.0074420  -6.948 1.18e-11 ***
rad          0.0134457  0.0025405   5.293 1.82e-07 ***
tax         -0.0005579  0.0001351  -4.129 4.28e-05 ***
ptratio     -0.0374259  0.0051715  -7.237 1.77e-12 ***
black        0.0004127  0.0001071   3.852 0.000133 ***
lstat       -0.0286039  0.0019002 -15.053  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 0.1898 on 494 degrees of freedom
Multiple R-squared:  0.7891,    Adjusted R-squared:  0.7844 
F-statistic: 168.1 on 11 and 494 DF,  p-value: < 2.2e-16
summary(mod2)

Call:
lm(formula = medv ~ crim + zn + chas + nox + rm + I(rm^2) + dis + 
    rad + tax + ptratio + black + lstat + I(lstat^2), data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-26.4974  -2.2978  -0.1797   1.8022  26.4403 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 116.316659   9.262352  12.558  < 2e-16 ***
crim         -0.151178   0.027816  -5.435 8.64e-08 ***
zn            0.020712   0.011587   1.788 0.074468 .  
chas          2.425973   0.718922   3.374 0.000798 ***
nox         -14.539960   2.995854  -4.853 1.63e-06 ***
rm          -21.743779   2.784329  -7.809 3.49e-14 ***
I(rm^2)       1.963233   0.217161   9.040  < 2e-16 ***
dis          -1.164971   0.158153  -7.366 7.46e-13 ***
rad           0.236699   0.053534   4.421 1.21e-05 ***
tax          -0.008668   0.002846  -3.046 0.002443 ** 
ptratio      -0.690526   0.110027  -6.276 7.63e-10 ***
black         0.007000   0.002255   3.104 0.002022 ** 
lstat        -1.248991   0.119729 -10.432  < 2e-16 ***
I(lstat^2)    0.020466   0.003275   6.250 8.93e-10 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 3.984 on 492 degrees of freedom
Multiple R-squared:  0.8172,    Adjusted R-squared:  0.8123 
F-statistic: 169.2 on 13 and 492 DF,  p-value: < 2.2e-16
boxCox(mod2)

mod3 <- lm(medv ~ crim + zn + chas + nox + rm + I(rm^2) + dis + rad + tax + ptratio + black + lstat, data = Boston)
summary(mod3)

Call:
lm(formula = medv ~ crim + zn + chas + nox + rm + I(rm^2) + dis + 
    rad + tax + ptratio + black + lstat, data = Boston)

Residuals:
     Min       1Q   Median       3Q      Max 
-28.1291  -2.2462  -0.2559   1.6153  26.9025 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept) 135.190606   9.087840  14.876  < 2e-16 ***
crim         -0.131174   0.028677  -4.574 6.06e-06 ***
zn            0.033005   0.011851   2.785  0.00556 ** 
chas          2.443193   0.746149   3.274  0.00113 ** 
nox         -16.784004   3.086920  -5.437 8.53e-08 ***
rm          -28.796902   2.641760 -10.901  < 2e-16 ***
I(rm^2)       2.540282   0.203998  12.452  < 2e-16 ***
dis          -1.176651   0.164132  -7.169 2.78e-12 ***
rad           0.240531   0.055558   4.329 1.81e-05 ***
tax          -0.009542   0.002950  -3.235  0.00130 ** 
ptratio      -0.740025   0.113898  -6.497 2.00e-10 ***
black         0.007221   0.002340   3.085  0.00215 ** 
lstat        -0.543575   0.041440 -13.117  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 4.135 on 493 degrees of freedom
Multiple R-squared:  0.8027,    Adjusted R-squared:  0.7979 
F-statistic: 167.1 on 12 and 493 DF,  p-value: < 2.2e-16
boxCox(mod3)